library(dplyr)
library(readr)
library(DataExplorer)
library(cluster)
library(factoextra)
library(FactoMineR)
library(janitor)
library(ggplot2)
library(tidyr)
library(reshape2)
library(stringr)
library(scales)
library(plotly)
knitr::opts_chunk$set(echo = TRUE)
toxins_2023 <- read_csv("C:/Users/wanel/OneDrive/Documents/GitHub/STAT-442-Final-Project/Datasets/2023_us.csv")
## Rows: 77964 Columns: 122
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (31): 2. TRIFD, 4. FACILITY NAME, 5. STREET ADDRESS, 6. CITY, 7. COUNTY,...
## dbl (84): 1. YEAR, 3. FRS ID, 10. BIA, 12. LATITUDE, 13. LONGITUDE, 22. INDU...
## lgl (7): 24. PRIMARY SIC, 25. SIC 2, 26. SIC 3, 27. SIC 4, 28. SIC 5, 29. S...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
toxins_clean <- toxins_2023 %>%
select(-c(24:29)) %>%
mutate(`45. METAL CATEGORY` = ifelse(`45. METAL CATEGORY` == "Metal complound categories", "Metal compound categories",`45. METAL CATEGORY` ))
# Load required libraries
library(dplyr)
library(janitor)
library(ggplot2)
library(tidyr)
library(reshape2)
library(stringr)
library(scales)
# Clean all column names and address multiple patterns like 'x#_', '5_5_1_', and '6_2_m79'
toxins_named <- toxins_2023 %>%
clean_names() %>%
rename_with(~ gsub("^x\\d+_", "", .), everything()) %>% # Remove 'x#_' prefixes
rename_with(~ gsub("^\\d+(_\\d+)*_", "", .), everything()) # Remove leading numeric patterns like '5_5_1_' or '6_2_'
toxins_named <- toxins_named %>%
mutate(chemical = case_when(
str_detect(chemical, "Chromium and Chromium Compounds") ~ "Chromium and Chromium Compounds",
str_detect(chemical, "Sulfuric acid") ~ "Sulfuric acid",
str_detect(chemical, "Nitrate compounds") ~ "Nitrate compounds",
str_detect(chemical, "Barium compounds") ~ "Barium compounds (except for barium sulfate)",
TRUE ~ chemical
))
# Now proceed with your summary and plotting code
chem_release_summary_haz <- toxins_named %>%
filter(clean_air_act_chemical == "YES") %>%
group_by(chemical) %>%
summarise(total_releases = sum(total_releases, na.rm = TRUE)) %>%
arrange(desc(total_releases)) %>%
head(10)
# For hazardous toxins (p1)
p1 <- ggplot(chem_release_summary_haz, aes(x = reorder(chemical, total_releases), y = total_releases)) +
geom_segment(aes(xend = chemical, yend = 0), color = "skyblue") +
geom_point(size = 4, color = "darkblue") +
coord_flip() +
labs(title = "Top 10 Hazardous Toxins Released",
y = "Total Release",
x = "Chemical") +
theme_minimal() +
theme(
axis.text.y = element_text(angle = 0, hjust = 1),
plot.title = element_text(hjust = 0.5),
panel.grid.major.y = element_blank()
) +
scale_y_continuous(labels = scales::comma)
# Now proceed with your summary and plotting code
chem_release_summary_non <- toxins_named %>%
filter(clean_air_act_chemical == "NO") %>%
group_by(chemical) %>%
summarise(total_releases = sum(total_releases, na.rm = TRUE)) %>%
arrange(desc(total_releases)) %>%
head(10)
# For non-hazardous toxins (p2)
p2 <- ggplot(chem_release_summary_non, aes(x = reorder(chemical, total_releases), y = total_releases)) +
geom_segment(aes(xend = chemical, yend = 0), color = "lightgreen") +
geom_point(size = 4, color = "darkgreen") +
coord_flip() +
labs(title = "Top 10 Non-Hazardous Toxins Released",
y = "Total Release",
x = "Chemical") +
theme_minimal() +
theme(
axis.text.y = element_text(angle = 0, hjust = 1),
plot.title = element_text(hjust = 0.5),
panel.grid.major.y = element_blank()
) +
scale_y_continuous(labels = scales::comma)
# Summarize release amounts by classification
classification_summary <- toxins_named %>%
group_by(classification) %>%
summarise(total_release = sum(total_releases, na.rm = TRUE)) %>%
arrange(desc(total_release))
# Plot for Classification (Top 10)
p3 <- classification_summary %>%
top_n(10, total_release) %>%
ggplot(aes(x = reorder(classification, total_release), y = total_release, fill = classification)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Top 10 Toxins Released by Classification",
y = "Total Release",
x = "Classification") +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5)) +
scale_y_continuous(labels = comma)
# Pivot data to gather different release methods
release_methods <- toxins_named %>%
select(chemical, fugitive_air, stack_air, water,
underground_cl_i, underground_c_ii_v,
`1a_rcra_c_landfill`, `1b_other_landfills`,
other_disposal) %>%
pivot_longer(cols = fugitive_air:other_disposal,
names_to = "release_method",
values_to = "amount") %>%
group_by(release_method) %>%
summarise(total_amount = sum(amount, na.rm = TRUE))
# Rename release methods for better readability
release_methods <- release_methods %>%
mutate(release_method = case_when(
release_method == "fugitive_air" ~ "Fugitive Air",
release_method == "stack_air" ~ "Stack Air",
release_method == "water" ~ "Water",
release_method == "underground_cl_i" ~ "Underground Class I",
release_method == "underground_c_ii_v" ~ "Underground Class II-V",
release_method == "1a_rcra_c_landfill" ~ "RCRA C Landfill",
release_method == "1b_other_landfills" ~ "Other Landfills",
release_method == "other_disposal" ~ "Other Disposal",
TRUE ~ release_method
))
# Plot for Different Release Methods
p4 <- ggplot(release_methods, aes(x = reorder(release_method, total_amount), y = total_amount, fill = release_method)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Amounts Released by Different Onsite Methods",
x = "Release Method",
y = "Total Amount") +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5)) +
scale_y_continuous(labels = comma) +
scale_fill_brewer(palette = "Set3")
# List of offsite release methods (excluding total)
offsite_methods <- c("m10", "m41", "m62", "m81", "m82",
"m66", "m67", "m64", "m65", "m73", "m79",
"m90", "m94", "m99")
# Pivot data to gather different offsite release methods
offsite_release_methods <- toxins_named %>%
select(all_of(offsite_methods)) %>%
pivot_longer(cols = everything(),
names_to = "release_method",
values_to = "amount") %>%
group_by(release_method) %>%
summarise(total_amount = sum(amount, na.rm = TRUE))
# Rename release methods for better readability
offsite_release_methods <- offsite_release_methods %>%
mutate(release_method = case_when(
release_method == "m10" ~ "Storage Only",
release_method == "m41" ~ "Solidification/Stabilization",
release_method == "m62" ~ "Wastewater Treatment",
release_method == "m81" ~ "Landfill/Disposal Surface Impoundment",
release_method == "m82" ~ "Land Treatment",
release_method == "m66" ~ "Other Waste Treatment",
release_method == "m67" ~ "Other Waste Treatment",
release_method == "m64" ~ "Other Disposal",
release_method == "m65" ~ "Other Disposal",
release_method == "m73" ~ "Land Treatment",
release_method == "m79" ~ "Other Land Disposal",
release_method == "m90" ~ "Other Off-Site Management",
release_method == "m94" ~ "Transfer to Waste Broker",
release_method == "m99" ~ "Unknown",
TRUE ~ release_method
))
# Plot for Different Offsite Release Methods
p5 <- ggplot(offsite_release_methods, aes(x = reorder(release_method, total_amount), y = total_amount, fill = release_method)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Amounts Released by Different Offsite Methods",
x = "Release Method",
y = "Total Amount") +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5),
axis.text.y = element_text(size = 8)) + # Smaller text for y-axis labels
scale_y_continuous(labels = comma) +
scale_fill_brewer(palette = "Set3")
# Calculate proportions for treated, recycled, etc.
proportions <- toxins_named %>%
summarise(
total_treated = sum(treatment_on_site + treatment_off_site, na.rm = TRUE),
total_recycled = sum(recycling_on_site + recycling_off_sit, na.rm = TRUE),
total_energy_recovery = sum(energy_recover_on + energy_recover_of, na.rm = TRUE),
total_release = sum(total_releases, na.rm = TRUE)
) %>%
pivot_longer(everything(), names_to = "category", values_to = "amount") %>%
mutate(proportion = amount / sum(amount),
category = case_when(
category == "total_treated" ~ "Treated",
category == "total_recycled" ~ "Recycled",
category == "total_energy_recovery" ~ "Energy Recovery",
category == "total_release" ~ "Released",
TRUE ~ category
))
# Plot for Proportions
p6 <- ggplot(proportions, aes(x = reorder(category, -proportion), y = proportion, fill = category)) +
geom_bar(stat = "identity") +
geom_text(aes(label = scales::percent(proportion, accuracy = 0.1)),
position = position_stack(vjust = 0.5)) +
labs(title = "Proportion of Toxins Treated, Recycled, Recovered, and Released",
x = "Category",
y = "Proportion") +
theme_minimal() +
theme(legend.position = "none") +
scale_y_continuous(labels = scales::percent) +
scale_fill_brewer(palette = "Set2")
# Calculate on-site and off-site releases
onsite_offsite_summary <- toxins_named %>%
summarise(
on_site_release = sum(on_site_release_total, na.rm = TRUE),
off_site_release = sum(off_site_release_total, na.rm = TRUE)
) %>%
pivot_longer(everything(), names_to = "release_type", values_to = "amount") %>%
mutate(release_type = case_when(
release_type == "on_site_release" ~ "On-Site Release",
release_type == "off_site_release" ~ "Off-Site Release",
TRUE ~ release_type
))
# Calculate percentages
total_release <- sum(onsite_offsite_summary$amount)
onsite_offsite_summary <- onsite_offsite_summary %>%
mutate(percentage = amount / total_release * 100)
# Plot for On-Site vs Off-Site Releases
p7 <- ggplot(onsite_offsite_summary, aes(x = release_type, y = amount, fill = release_type)) +
geom_bar(stat = "identity", width = 0.6) +
geom_text(aes(label = paste0(comma(amount), "\n(", round(percentage, 1), "%)")),
position = position_stack(vjust = 0.5),
size = 4) +
labs(title = "Comparison of On-Site vs Off-Site Releases",
x = NULL, # Remove x-axis label as it's redundant
y = "Total Amount",
fill = "Release Type") +
theme_minimal() +
theme(legend.position = "none", # Remove legend as it's redundant
axis.text.x = element_text(size = 12, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 16, face = "bold")) +
scale_y_continuous(labels = comma) +
scale_fill_brewer(palette = "Set2")
ggplotly(p1)
ggplotly(p2)
ggplotly(p3)
ggplotly(p4)
ggplotly(p5)
ggplotly(p6)
ggplotly(p7)
# Data preparation for on-site
toxins_clustered_on_site <- toxins_2023 %>%
select(c(51:53, 55, 56, 58:60, 62:65)) %>% # Select specified columns
na.omit() %>% # Remove rows with missing values
filter(.[[ncol(.)]] != 0) # Remove rows where column 65 has a value of 0
# Scaling the data
toxins_clustered_on_site_scaled <- scale(toxins_clustered_on_site)
# Data preparation for off-site
toxins_clustered_off_site <- toxins_2023 %>%
select(c(66:71, 75, 76, 79:88)) %>% # Select specified columns
na.omit() %>% # Remove rows with missing values
filter(.[[ncol(.)]] != 0) # Remove rows where column 88 has a value of 0
# Scaling the data
toxins_clustered_off_site_scaled <- scale(toxins_clustered_off_site)
sample_size <- 10000
# For on-site data
random_sample_on_site <- toxins_clustered_on_site_scaled[sample(1:nrow(toxins_clustered_on_site_scaled), sample_size), ]
# For off-site data
random_sample_off_site <- toxins_clustered_off_site_scaled[sample(1:nrow(toxins_clustered_off_site_scaled), sample_size), ]
# Step 2: Visualize the elbow method using fviz_nbclust (within-cluster sum of squares)
# On-site data
fviz_nbclust(random_sample_on_site, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for On-site Data")
# Off-site data
fviz_nbclust(random_sample_off_site, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for Off-site Data")
# Clustering with 2 clusters for on-site
kmeans_on_site <- kmeans(toxins_clustered_on_site_scaled, centers = 2)
kmeans_on_site$centers
## 51. 5.1 - FUGITIVE AIR 52. 5.2 - STACK AIR 53. 5.3 - WATER
## 1 1.263709e-01 -8.520503e-02 -3.475093e-02
## 2 -6.806698e-06 4.589387e-06 1.871785e-06
## 55. 5.4.1 - UNDERGROUND CL I 56. 5.4.2 - UNDERGROUND C II-V
## 1 -2.606665e-02 -8.729126e-03
## 2 1.404024e-06 4.701757e-07
## 58. 5.5.1A - RCRA C LANDFILL 59. 5.5.1B - OTHER LANDFILLS
## 1 -3.235243e-02 -4.222658e-02
## 2 1.742595e-06 2.274444e-06
## 60. 5.5.2 - LAND TREATMENT 62. 5.5.3A - RCRA SURFACE IM
## 1 -1.793947e-02 -4.484244e-03
## 2 9.662714e-07 2.415342e-07
## 63. 5.5.3B - OTHER SURFACE I 64. 5.5.4 - OTHER DISPOSAL
## 1 105.530269207 129.879566229
## 2 -0.005684163 -0.006995686
## 65. ON-SITE RELEASE TOTAL
## 1 125.908039405
## 2 -0.006781768
kmeans_on_site$size
## [1] 3 55697
kmeans_on_site$tot.withinss
## [1] 536806.3
# Display the ratio of between_SS to total_SS
between_tot_ratio_on <- kmeans_on_site$betweenss / kmeans_on_site$totss * 100
cat("Between_SS / Total_SS =", round(between_tot_ratio_on, 1), "%\n")
## Between_SS / Total_SS = 19.7 %
fviz_cluster(kmeans_on_site, data = toxins_clustered_on_site_scaled,
geom = "point", ellipse.type = "norm",
ggtheme = theme_minimal())
## Too few points to calculate an ellipse
# Clustering with 6 clusters for off-site
kmeans_off_site <- kmeans(toxins_clustered_off_site_scaled, centers = 2)
kmeans_off_site$centers
## 66. 6.1 - POTW - TRNS RLSE 67. 6.1 - POTW - TRNS TRT
## 1 8.71454390 10.68526884
## 2 -0.01672721 -0.02050994
## 68. POTW - TOTAL TRANSFERS 69. 6.2 - M10 70. 6.2 - M41 71. 6.2 - M62
## 1 11.23359393 -4.296281e-02 3.616476185 -0.02662729
## 2 -0.02156243 8.246536e-05 -0.006941679 0.00005111
## 75. 6.2 - M81 76. 6.2 - M82 79. 6.2 - M66 80. 6.2 - M67 81. 6.2 - M64
## 1 2.478007849 5.3915696 -7.387454e-03 -1.828471e-02 11.78490981
## 2 -0.004756436 -0.0103489 1.417992e-05 3.509676e-05 -0.02262066
## 82. 6.2 - M65 83. 6.2 - M73 84. 6.2 - M79 85. 6.2 - M90 86. 6.2 - M94
## 1 4.142540154 -2.391045e-02 -3.897248e-02 4.269182354 -3.737444e-02
## 2 -0.007951438 4.589514e-05 7.480609e-05 -0.008194523 7.173872e-05
## 87. 6.2 - M99 88. OFF-SITE RELEASE TOTAL
## 1 -4.438416e-02 14.73970989
## 2 8.519359e-05 -0.02829228
kmeans_off_site$size
## [1] 50 26049
kmeans_off_site$tot.withinss
## [1] 431884.5
# Display the ratio of between_SS to total_SS
between_tot_ratio_off <- kmeans_off_site$betweenss / kmeans_off_site$totss * 100
cat("Between_SS / Total_SS =", round(between_tot_ratio_off, 1), "%\n")
## Between_SS / Total_SS = 8.1 %
fviz_cluster(kmeans_off_site, data = toxins_clustered_off_site_scaled,
geom = "point", ellipse.type = "norm",
ggtheme = theme_minimal())
# Data preparation for total
toxins_clustered_total <- toxins_2023 %>%
select(c(65, 88, 94, 113, 116, 119, 107, 106, 120)) %>%
na.omit()
# Scaling the data
toxins_clustered_total_scaled <- scale(toxins_clustered_total)
# For total data
random_sample_total <- toxins_clustered_total_scaled[sample(1:nrow(toxins_clustered_total_scaled), sample_size), ]
# Step 2: Visualize the elbow method using fviz_nbclust (within-cluster sum of squares)
# On-site data
fviz_nbclust(random_sample_total, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for Total Data")
# Clustering with 2 clusters for on-site
kmeans_total <- kmeans(toxins_clustered_total_scaled, centers = 2)
kmeans_total$centers
## 65. ON-SITE RELEASE TOTAL 88. OFF-SITE RELEASE TOTAL
## 1 -0.01578838 -0.01207179
## 2 10.00797559 7.65209425
## 94. OFF-SITE RECYCLED TOTAL 113. 8.2 - ENERGY RECOVER ON
## 1 -0.01913677 -0.01362149
## 2 12.13046288 8.63442099
## 116. 8.5 - RECYCLING OFF SIT 119. PRODUCTION WSTE (8.1-8.7)
## 1 -0.01919015 -0.02149019
## 2 12.16429663 13.62225482
## 107. TOTAL RELEASES 106. 6.2 - TOTAL TRANSFER 120. 8.8 - ONE-TIME RELEASE
## 1 -0.01640607 -0.01579178 -0.01081905
## 2 10.39951763 10.01012788 6.85800245
kmeans_total$size
## [1] 10776 17
kmeans_total$tot.withinss
## [1] 80623.29
# Display the ratio of between_SS to total_SS
between_tot_ratio_total <- kmeans_total$betweenss / kmeans_total$totss * 100
cat("Between_SS / Total_SS =", round(between_tot_ratio_total, 1), "%\n")
## Between_SS / Total_SS = 17 %
fviz_cluster(kmeans_total, data = toxins_clustered_total_scaled,
geom = "point", ellipse.type = "norm",
ggtheme = theme_minimal())
## PCA
# Perform PCA on on-site scaled data
pca_on_site <- prcomp(toxins_clustered_on_site_scaled, center = TRUE, scale. = TRUE)
# Summary of PCA to see variance explained by each principal component
summary(pca_on_site)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.6531 1.05258 1.01878 1.00567 1.0010 1.00010 0.99999
## Proportion of Variance 0.2277 0.09233 0.08649 0.08428 0.0835 0.08335 0.08333
## Cumulative Proportion 0.2277 0.32005 0.40655 0.49083 0.5743 0.65768 0.74101
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.99879 0.98111 0.94517 0.50430 2.959e-11
## Proportion of Variance 0.08313 0.08021 0.07445 0.02119 0.000e+00
## Cumulative Proportion 0.82415 0.90436 0.97881 1.00000 1.000e+00
# Visualize the PCA
# Scree plot to show the variance explained by each principal component
fviz_eig(pca_on_site)
# Alternatively, you can plot the contributions of each variable to the first two principal components
fviz_pca_var(pca_on_site,
col.var = "contrib", # color based on contributions of variables
gradient.cols = c("blue", "green", "red"),
repel = TRUE) # repel text to avoid overlap
# Perform PCA on off-site scaled data
pca_off_site <- prcomp(toxins_clustered_off_site_scaled, center = TRUE, scale. = TRUE)
# Summary of PCA to see variance explained by each principal component
summary(pca_off_site)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5712 1.4361 1.11480 1.00952 1.00709 1.00164 1.00055
## Proportion of Variance 0.1371 0.1146 0.06904 0.05662 0.05635 0.05574 0.05562
## Cumulative Proportion 0.1371 0.2517 0.32077 0.37738 0.43373 0.48947 0.54509
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.00017 1.00006 1.00004 0.9986 0.99842 0.99203 0.99092
## Proportion of Variance 0.05557 0.05556 0.05556 0.0554 0.05538 0.05467 0.05455
## Cumulative Proportion 0.60066 0.65622 0.71178 0.7672 0.82257 0.87724 0.93179
## PC15 PC16 PC17 PC18
## Standard deviation 0.78911 0.77785 3.108e-10 1.812e-10
## Proportion of Variance 0.03459 0.03361 0.000e+00 0.000e+00
## Cumulative Proportion 0.96639 1.00000 1.000e+00 1.000e+00
# Visualize the PCA
# Scree plot to show the variance explained by each principal component
fviz_eig(pca_off_site)
# Alternatively, you can plot the contributions of each variable to the first two principal components
fviz_pca_var(pca_off_site,
col.var = "contrib", # color based on contributions of variables
gradient.cols = c("blue", "green", "red"),
repel = TRUE) # repel text to avoid overlap
# Perform PCA on off-site scaled data
pca_total <- prcomp(toxins_clustered_total_scaled, center = TRUE, scale. = TRUE)
# Summary of PCA to see variance explained by each principal component
summary(pca_total)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5130 1.4381 1.3773 1.1816 0.9213 0.61822 0.34114
## Proportion of Variance 0.2544 0.2298 0.2108 0.1551 0.0943 0.04247 0.01293
## Cumulative Proportion 0.2544 0.4841 0.6949 0.8500 0.9443 0.98678 0.99971
## PC8 PC9
## Standard deviation 0.05079 1.9e-11
## Proportion of Variance 0.00029 0.0e+00
## Cumulative Proportion 1.00000 1.0e+00
# Visualize the PCA
# Scree plot to show the variance explained by each principal component
fviz_eig(pca_total)
# Alternatively, you can plot the contributions of each variable to the first two principal components
fviz_pca_var(pca_total,
col.var = "contrib", # color based on contributions of variables
gradient.cols = c("blue", "green", "red"),
repel = TRUE) # repel text to avoid overlap
# Step 1: Extract the first 2 PCs for on-site and off-site data
pca_scores_on_site <- pca_on_site$x[, 1:2] # First 2 PCs for On-Site
pca_scores_off_site <- pca_off_site$x[, 1:2] # First 2 PCs for Off-Site
pca_score_total <- pca_total$x[,1:2]
# Step 2: Combine PCA scores with the original data
pca_on_site <- cbind(toxins_clustered_on_site, pca_scores_on_site)
pca_off_site <- cbind(toxins_clustered_off_site, pca_scores_off_site)
pca_total <- cbind(toxins_clustered_total, pca_score_total)
# Step 3: Scale PCA scores and the original data
pca_on_site_scaled <- scale(pca_on_site[, (ncol(toxins_clustered_on_site) + 1):(ncol(toxins_clustered_on_site) + 2)]) # Scale PCA scores
pca_off_site_scaled <- scale(pca_off_site[, (ncol(toxins_clustered_off_site) + 1):(ncol(toxins_clustered_off_site) + 2)]) # Scale PCA scores
pca_total_scaled <- scale(pca_total[, (ncol(toxins_clustered_total) + 1):(ncol(toxins_clustered_total) + 2)])
# Step 4: Run k-means clustering for On-Site data (assume 6 clusters for demonstration)
k_on_site <- kmeans(pca_on_site[, ncol(pca_on_site)-1:ncol(pca_on_site)], centers = 2, nstart = 25)
# Step 5: Calculate the proportion of variance explained by the clustering
cat("Proportion of variance explained (On-Site):", k_on_site$betweenss / k_on_site$totss, "\n")
## Proportion of variance explained (On-Site): 0.8206545
# Step 6: Visualize clusters for On-Site data
fviz_cluster(list(data = pca_on_site[, ncol(pca_on_site)-1:ncol(pca_on_site)], cluster = k_on_site$cluster),
geom = "point",
ellipse.type = "convex",
show.clust.cent = TRUE,
main = "Clusters from PCA (On-Site Data)")
random_sample_on_site_pca <- pca_on_site_scaled[sample(1:nrow(pca_on_site_scaled), sample_size), ]
# For off-site data
random_sample_off_site_pca <- pca_off_site_scaled[sample(1:nrow(pca_off_site_scaled), sample_size), ]
random_sample_total_pca <- pca_total_scaled[sample(1:nrow(pca_total_scaled), sample_size), ]
# Step 2: Visualize the elbow method using fviz_nbclust (within-cluster sum of squares)
# On-site data
fviz_nbclust(random_sample_on_site_pca, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for On-site Data")
# Off-site data
fviz_nbclust(random_sample_off_site_pca, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for Off-site Data")
# Step 7: Run k-means clustering for Off-Site data (assume 6 clusters for demonstration)
k_off_site <- kmeans(pca_off_site[, ncol(pca_off_site)-1:ncol(pca_off_site)], centers = 2, nstart = 25)
# Step 8: Calculate the proportion of variance explained by the clustering
cat("Proportion of variance explained (Off-Site):", k_off_site$betweenss / k_off_site$totss, "\n")
## Proportion of variance explained (Off-Site): 0.2787647
# Step 9: Visualize clusters for Off-Site data
fviz_cluster(list(data = pca_off_site[, ncol(pca_off_site)-1:ncol(pca_off_site)], cluster = k_off_site$cluster),
geom = "point",
ellipse.type = "convex",
show.clust.cent = TRUE,
main = "Clusters from PCA (Off-Site Data)")
# Total data
fviz_nbclust(random_sample_total_pca, kmeans, method = "silhouette") +
ggtitle("Silhouette Method for Total Data")
# Step 7: Run k-means clustering for Off-Site data (assume 6 clusters for demonstration)
k_total <- kmeans(pca_total[, ncol(pca_total)-1:ncol(pca_total)], centers = 2, nstart = 25)
# Step 8: Calculate the proportion of variance explained by the clustering
cat("Proportion of variance explained (Total):", k_total$betweenss / k_total$totss, "\n")
## Proportion of variance explained (Total): 0.5246632
# Step 9: Visualize clusters for Off-Site data
fviz_cluster(list(data = pca_total[, ncol(pca_total)-1:ncol(pca_total)], cluster = k_total$cluster),
geom = "point",
ellipse.type = "convex",
show.clust.cent = TRUE,
main = "Clusters from PCA (Total Data)")
Totals are very strong predictors.
# Step 1: Load necessary libraries
library(FactoMineR)
library(factoextra)
# Step 2: Prepare the data
mfa_toxins <- toxins_2023
# Step 3: Select variables for MFA (replace indices with appropriate column numbers)
MFAData <- toxins_2023[, c(37, 23, 46, 8, 44, 65, 88, 94, 106, 107, 119)]
# Step 4: Convert categorical variables to factors and numerical to numeric
# Adjust column indices based on the dataset
categorical_cols <- c(1, 2, 3, 4, 5) # Example indices for categorical variables
numerical_cols <- c(6, 7, 8, 9, 10, 11) # Example indices for numerical variables
MFAData[categorical_cols] <- lapply(MFAData[categorical_cols], as.factor)
MFAData[numerical_cols] <- lapply(MFAData[numerical_cols], as.numeric)
# Step 5: Remove rows with missing values
MFAData <- na.omit(MFAData)
# Step 6: Random sampling if dataset is too large
sample_size <- 1000 # Define the desired sample size
if (nrow(MFAData) > sample_size) {
set.seed(123) # Ensure reproducibility
MFAData <- MFAData[sample(1:nrow(MFAData), sample_size), ]
}
# Step 7: Define groups and types for each variable
group_sizes <- rep(1, ncol(MFAData)) # Each variable is its own group
types <- c(rep("n", length(categorical_cols)), rep("s", length(numerical_cols))) # Types for each variable
# Step 8: Run MFA
MFA1 <- MFA(
MFAData,
group = group_sizes, # Each variable treated as a separate group
type = types, # Specify type for each variable
name.group = colnames(MFAData), # Use column names for group names
graph = FALSE # Do not automatically plot graphs
)
# Step 9: Visualizations
# Scree plot: variance explained by dimensions
fviz_screeplot(MFA1, addlabels = TRUE, ylim = c(0, 50))
# Variable contributions to the dimensions
fviz_mfa_var(MFA1,
repel = TRUE, # Avoid label overlap
col.var = "cos2", # Color by quality of representation (cos2 indicates the quality)
gradient.cols = c("blue", "green", "red"))
# Individuals (data points) representation
fviz_mfa_ind(MFA1,
repel = TRUE, # Avoid label overlap
geom = "point",
col.ind = "cos2", # Color by quality of representation (cos2 indicates the quality)
gradient.cols = c("blue", "green", "red"))
# Variable contributions to the dimensions (by group)
fviz_mfa_var(MFA1, "group",
repel = TRUE, # Avoid label overlap
col.var = "cos2", # Color by quality of representation (cos2 indicates the quality)
gradient.cols = c("blue", "green", "red"))
# Quantitative variables
fviz_mfa_var(MFA1,
choice = "quanti.var", # For quantitative variables
repel = TRUE, # Avoid label overlap
col.var = "cos2", # Color by quality of representation (cos2 indicates the quality)
gradient.cols = c("blue", "green", "red"))